{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "from sklearn.externals import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import ipywidgets as ipy\n",
    "from random import sample\n",
    "import shap\n",
    "shap.initjs()\n",
    "%matplotlib inline\n",
    "plt.style.use('seaborn-whitegrid')\n",
    "\n",
    "DIR = 'PATH/TO/YOUR/DATA'\n",
    "EXPERIMENT_DIR = 'PATH/TO/YOUR/EXPERIMENT'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "solution_path = 'solution_6_best_stacking'\n",
    "\n",
    "application = pd.read_csv(os.path.join(DIR,'files/unzipped_data/application_train.csv'))\n",
    "oof_train = pd.read_csv(os.path.join(EXPERIMENT_DIR,'{}/lightGBM_stacking_with_features_out_of_fold_train_predictions.csv'.format(solution_path)))\n",
    "model = joblib.load(os.path.join(EXPERIMENT_DIR,'{}/transformers/light_gbm_fold_0'.format(solution_path)))\n",
    "features = joblib.load(os.path.join(EXPERIMENT_DIR,'{}/outputs/feature_concat_fold_0'.format(solution_path)))\n",
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'), encoding='latin1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index_list = oof_train[oof_train.fold_id==0]['SK_ID_CURR']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "oof_train_0 = oof_train[oof_train['SK_ID_CURR'].isin(index_list)]\n",
    "application_0 = application[application['SK_ID_CURR'].isin(index_list)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "oof_train_0 = oof_train[oof_train['SK_ID_CURR'].isin(index_list)]\n",
    "application_0 = application[application['SK_ID_CURR'].isin(index_list)]\n",
    "features_df = features['features']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "exploration_data = oof_train_0.copy()\n",
    "exploration_data = exploration_data.groupby(['SK_ID_CURR']).mean().reset_index()\n",
    "exploration_data['target'] = application_0['TARGET'].values\n",
    "exploration_data['diff_abs'] = np.abs(exploration_data['lightGBM_stacking_with_features_prediction'] - exploration_data['target'])\n",
    "exploration_data['diff'] = exploration_data['lightGBM_stacking_with_features_prediction'] - exploration_data['target']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Features description"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Description of features(Only from kaggle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@ipy.interact(\n",
    "    search = ipy.Text(\n",
    "        description='Search',\n",
    "    )\n",
    ")\n",
    "def gunc(search):\n",
    "    if len(search) > 0:\n",
    "        mask = list(map(lambda x: x.find(search.lower())!=-1, description['Row'].str.lower().values))\n",
    "        rows = description[mask]['Row']\n",
    "        if len(rows) == 0:\n",
    "            rows = description.Row\n",
    "            selected = rows[0]\n",
    "        if len(rows) > 1:\n",
    "            selected = rows.values[0]\n",
    "        elif len(rows) == 1:\n",
    "            selected = rows.item()\n",
    "    else:\n",
    "        rows = description.Row\n",
    "        selected = rows[0]\n",
    "    @ipy.interact(\n",
    "            cols = ipy.SelectMultiple(\n",
    "                options=rows,\n",
    "                rows=10,\n",
    "                value=(selected,),\n",
    "                description='Features',\n",
    "                layout=ipy.Layout(width='90%')\n",
    "            )\n",
    "    )\n",
    "    def func(cols):\n",
    "        for i, col in enumerate(cols):\n",
    "            display('{} --- {}'.format(col, description[description.Row==col]['Description'].values[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Shap - Feature impact on a model\n",
    "\n",
    "https://github.com/slundberg/shap \n",
    "\n",
    "https://arxiv.org/pdf/1802.03888.pdf \n",
    "\n",
    "http://papers.nips.cc/paper/7062-a-unified-approach-to-interpreting-model-predictions.pdf "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our first step is to compute SHAP values for each example in our dataset. $base\\_value$ is mean of our predictions and in our dataset equals $0.04942$ and will be flagged on plots below. The Shap value tells us how certain feature moved our prediction on ceratin example from expected value of all predictions.\n",
    "$$ output\\_value (x) = base\\_value + \\sum\\limits_{i=1}^{M} \\phi_{i}z_{i}(x) $$,\n",
    "\n",
    "where $ z_i(x) \\in \\{0, 1\\}$ describes if $i$-th feature-value(e.g. SEX='Male') occurs at example $x$ and $\\phi_i$ is SHAP value of given feature."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "shap_values = shap.TreeExplainer(model).shap_values(features_df)\n",
    "global_shap_vals = np.abs(shap_values).mean(0)[:-1]\n",
    "inds = np.argsort(global_shap_vals)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display(shap.force_plot(shap_values[0,:], features_df.iloc[0,:], link=\"logit\"))\n",
    "display(shap.force_plot(shap_values[1,:], features_df.iloc[1,:], link=\"logit\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature importance"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Our metric will be mean of SHAP's absolute values, which tell how much each feature is moving up or down predictions of our model from $base\\_value$. Features are sorted from most to least important."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@ipy.interact(cols=ipy.IntRangeSlider(\n",
    "                value=(1, 20),\n",
    "                min=1,\n",
    "                max=features_df.shape[1],\n",
    "                description='Features:',\n",
    "                continuous_update=False,\n",
    "                layout=ipy.Layout(width='90%', height='30px')\n",
    "            ))\n",
    "def func(cols):\n",
    "    min_index = -cols[1]\n",
    "    max_index = -cols[0]\n",
    "    y_pos = np.arange(features_df.shape[1])\n",
    "    plt.title(\"Feature importance: mean(|SHAP|)\")\n",
    "    plt.barh(y_pos[min_index:max_index], global_shap_vals[inds][min_index:max_index], color=\"#1E88E5\")\n",
    "    plt.yticks(y_pos[min_index:max_index], features_df.columns[inds][min_index:max_index])\n",
    "    plt.gca().spines['right'].set_visible(False)\n",
    "    plt.gca().spines['top'].set_visible(False)\n",
    "    plt.xlabel(\"mean SHAP value magnitude (change in log odds)\")\n",
    "    plt.gcf().set_size_inches(11, (cols[1]-cols[0])//2)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### The plot below describes how top-20 features are changing the model predictions in dependence of their values. \n",
    "    * On x-axis there are SHAP values\n",
    "    * Each row is corresponding to some feature\n",
    "    * Each point is corresponding to some example in dataset\n",
    "    * Each point has color corresponding to him feature value(HIGH ~ Red, LOW ~ Blue)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display(shap.summary_plot(shap_values, features_df, max_display=20))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Relation between SHAP and feature values and coloring with values of another correlated feature."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "@ipy.interact(\n",
    "    search = ipy.Text(\n",
    "        description='Search',\n",
    "    )\n",
    ")\n",
    "def gunc(search):\n",
    "    lst = list(reversed(features_df.columns[inds]))\n",
    "    \n",
    "    if len(search) > 0:\n",
    "        mask = list(map(lambda x: x.find(search.lower())!=-1, map(lambda x: x.lower(), lst)))\n",
    "        rows = [item for i, item in enumerate(lst) if mask[i]]\n",
    "        if len(rows) == 0:\n",
    "            rows = description.Row\n",
    "            selected = rows[0]\n",
    "        selected = rows[0]\n",
    "    else:\n",
    "        rows = lst\n",
    "        selected = lst[0]\n",
    "    @ipy.interact(\n",
    "                cols = ipy.SelectMultiple(\n",
    "                    options=rows,\n",
    "                    rows=10,\n",
    "                    value=(selected, ),\n",
    "                    description='Corr Columns',\n",
    "                    layout=ipy.Layout(width='90%')\n",
    "                ),\n",
    "                num_samples=ipy.IntSlider(\n",
    "                    value=250,\n",
    "                    min=100,\n",
    "                    max=1000,\n",
    "                    step = 50,\n",
    "                    continuous_update=False,\n",
    "                    description='Samples:',\n",
    "                    layout=ipy.Layout(width='90%', height='30px')\n",
    "                )\n",
    "    )\n",
    "    def func(cols, num_samples):\n",
    "        smp = sample(range(len(shap_values)), num_samples)\n",
    "        for col in cols:\n",
    "            display(shap.dependence_plot(col, shap_values[smp], features_df.loc[smp,:]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(17,5))\n",
    "\n",
    "plt.title(\"Distribution of |Difference between predictions and target|\")\n",
    "sns.distplot(exploration_data[exploration_data['target']==0]['diff_abs'], \n",
    "             label='Target_0', \n",
    "             color='#1587E8',\n",
    "             hist_kws={'alpha': 0.8},\n",
    "             bins=100);\n",
    "sns.distplot(exploration_data[exploration_data['target']==1]['diff_abs'], \n",
    "             label='Target_1',\n",
    "             color='#F02958',\n",
    "             hist_kws={'alpha': 0.8},\n",
    "             bins=100);\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_analysis = features_df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_analysis['INDEX'] = index_list\n",
    "feature_analysis['TARGET'] = exploration_data['target']\n",
    "feature_analysis['DIFF'] = exploration_data['diff']\n",
    "feature_analysis['DIFF_ABS'] = exploration_data['diff_abs']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Distributions of choosen features in dependence of target value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "@ipy.interact(\n",
    "    search = ipy.Text(\n",
    "        description='Search',\n",
    "    )\n",
    ")\n",
    "def gunc(search):\n",
    "    lst = list(reversed(features_df.columns[inds]))\n",
    "    \n",
    "    if len(search) > 0:\n",
    "        mask = list(map(lambda x: x.find(search.lower())!=-1, map(lambda x: x.lower(), lst)))\n",
    "        rows = [item for i, item in enumerate(lst) if mask[i]]\n",
    "        if len(rows) == 0:\n",
    "            rows = description.Row\n",
    "            selected = rows[0]\n",
    "        selected = rows[0]\n",
    "    else:\n",
    "        rows = lst\n",
    "        selected = lst[0]\n",
    "    @ipy.interact(\n",
    "                cols = ipy.SelectMultiple(\n",
    "                    options=rows,\n",
    "                    rows=10,\n",
    "                    value=(selected, ),\n",
    "                    description='Corr Columns',\n",
    "                    layout=ipy.Layout(width='90%')\n",
    "                ),\n",
    "                num_samples=ipy.IntSlider(\n",
    "                    value=250,\n",
    "                    min=100,\n",
    "                    max=1000,\n",
    "                    step = 50,\n",
    "                    continuous_update=False,\n",
    "                    description='Samples:',\n",
    "                    layout=ipy.Layout(width='90%', height='30px')\n",
    "                )\n",
    "    )\n",
    "    def func(cols, num_samples):\n",
    "        for col in cols:\n",
    "            fig = plt.figure(figsize=(16, 8));\n",
    "            target_1 = feature_analysis[feature_analysis['TARGET']==1]\n",
    "            target_0 = feature_analysis[feature_analysis['TARGET']==0]\n",
    "            smp_1 = sample(range(len(target_1)), min(num_samples, len(target_1)))\n",
    "            smp_0 = sample(range(len(target_0)), min(num_samples, len(target_0)))\n",
    "            df = pd.concat([target_1.iloc[smp_1,:], target_0.iloc[smp_0,:]], axis=0)\n",
    "            display(sns.swarmplot(x='TARGET', y=col, data=df, palette=['#1587E8', '#F02958'],))\n",
    "            plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Distributions of choosen features in dependence of absolute difference between predictions and target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_between(df, col, interval):\n",
    "    return df[(interval[0] <= df[col]) & (df[col] <= interval[1])]\n",
    "\n",
    "def col_without_nan(df, col):\n",
    "    val = df[col]\n",
    "    return val[~np.isnan(val)]\n",
    "\n",
    "@ipy.interact(\n",
    "    search = ipy.Text(\n",
    "        description='Search',\n",
    "        continuous_update=False\n",
    "    )\n",
    ")\n",
    "def gunc(search):\n",
    "    lst = list(reversed(features_df.columns[inds]))\n",
    "    \n",
    "    if len(search) > 0:\n",
    "        mask = list(map(lambda x: x.find(search.lower())!=-1, map(lambda x: x.lower(), lst)))\n",
    "        rows = [item for i, item in enumerate(lst) if mask[i]]\n",
    "        if len(rows) == 0:\n",
    "            rows = description.Row\n",
    "            selected = rows[0]\n",
    "        selected = rows[0]\n",
    "    else:\n",
    "        rows = lst\n",
    "        selected = lst[0]\n",
    "    @ipy.interact(\n",
    "                diff_1=ipy.FloatRangeSlider(\n",
    "                    value=(0.5, 1.0),\n",
    "                    min=0.0,\n",
    "                    max=1.0,\n",
    "                    step=0.01,\n",
    "                    description='Difference_1:',\n",
    "                    continuous_update=False,\n",
    "                    layout=ipy.Layout(width='90%', height='30px')\n",
    "                ),  \n",
    "                diff_2=ipy.FloatRangeSlider(\n",
    "                    value=(0.0, 0.5),\n",
    "                    min=0.0,\n",
    "                    max=1.0,\n",
    "                    step=0.01,\n",
    "                    description='Difference_2:',\n",
    "                    continuous_update=False,\n",
    "                    layout=ipy.Layout(width='90%', height='30px')\n",
    "                ),\n",
    "                cols = ipy.SelectMultiple(\n",
    "                    options=rows,\n",
    "                    rows=10,\n",
    "                    value=(selected, ),\n",
    "                    description='Columns',\n",
    "                    layout=ipy.Layout(width='90%')\n",
    "                ))\n",
    "    def func(diff_1, diff_2, cols):\n",
    "        for col in cols:\n",
    "            vals_1 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_1) , col)\n",
    "            vals_2 = col_without_nan(get_between(feature_analysis, 'DIFF_ABS', diff_2) , col)\n",
    "            display(sns.distplot(vals_1, label='Difference_1', color='#F02958', hist_kws={'alpha': 0.7}, bins=min(100, len(vals_1))))\n",
    "            display(sns.distplot(vals_2, label='Difference_2', color='#1587E8', hist_kws={'alpha': 0.7}, bins=min(100, len(vals_2))))\n",
    "            plt.legend()\n",
    "            plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
